#install.packages("tidycensus")
# Load necessary libraries
library(tidycensus)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
#install.packages("factoextra")
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("ggplot2")
#install.packages("usmap")
#install.packages("maps")
library(ggplot2)
library(usmap)
library(maps)
#Abhay
#Employment - K202301 for 2021
# Fetch state-level employment status counts (Supplemental Estimates table
# K202301).
# FIX: "acs1-year" is not a documented `survey` value for get_acs() (the docs
# list "acs1", "acs3", "acs5"); K-prefixed tables are routed to the ACS
# Supplemental Estimates automatically, so use the documented "acs1".
df1 <- get_acs(geography = "state",
               table = "K202301",
               year = 2021,
               survey = "acs1",
               cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K202301 and caching the dataset for faster future access.
# Code-to-label lookup for the K202301 (employment status) variables.
variables_df1 <- data.frame(
  variable = paste0("K202301_00", 1:7),
  label = c(
    "Total",
    "In labor force:",
    "Civilian labor force:",
    "Employed",
    "Unemployed",
    "In Armed Forces",
    "Not in labor force"
  )
)
# Join with variable labels so each estimate carries a readable name,
# then spread to one row per state / one column per labelled variable.
df1_labeled <- left_join(df1, variables_df1, by = "variable")
df1_wide <- pivot_wider(df1_labeled,
                        names_from = label,
                        values_from = estimate,
                        id_cols = NAME)
# Derive rate columns as a percent of the population total.
# NOTE(review): the denominator is `Total`, not the civilian labor force --
# confirm that is the intended definition of these rates.
df1_wide <- mutate(
  df1_wide,
  EmploymentRate = `Employed` / `Total` * 100,
  UnemploymentRate = `Unemployed` / `Total` * 100,
  NotInLaborForceRate = `Not in labor force` / `Total` * 100
)
# Bar chart of employment rate per state, sorted from highest to lowest.
library(ggplot2)
ggplot(df1_wide, aes(x = reorder(NAME, -EmploymentRate), y = EmploymentRate)) +
  geom_col(fill = "steelblue") +
  labs(x = "State", y = "Employment Rate (%)", title = "Employment Rates by State") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# Plotting labor force participation
# NOTE(review): three geom_bar layers at the same x OVERLAY one another rather
# than stack, so smaller categories can be hidden behind larger ones.
# Consider pivoting long and using one stacked geom_bar -- confirm intent.
ggplot(df1_wide, aes(x = NAME)) +
geom_bar(aes(y = `Employed`, fill = "Employed"), stat = "identity") +
geom_bar(aes(y = `Unemployed`, fill = "Unemployed"), stat = "identity") +
geom_bar(aes(y = `In Armed Forces`, fill = "Armed Forces"), stat = "identity") +
scale_fill_manual(values = c("Employed" = "green", "Unemployed" = "red", "Armed Forces" = "blue")) +
theme(axis.text.x = element_text(angle = 90)) +
labs(x = "State", y = "Number of People", fill = "Category", title = "Labor Force Participation by State")
# Histogram of unemployment rates across states, 1-percentage-point bins.
ggplot(df1_wide, aes(x = UnemploymentRate)) +
geom_histogram(binwidth = 1, fill = "tomato", color = "black") +
labs(x = "Unemployment Rate (%)", y = "Number of States", title = "Distribution of Unemployment Rates across States")
# Correlation analysis across the labor-force count columns.
employment_data <- select(
  df1_wide,
  `Employed`, `Unemployed`, `In Armed Forces`, `Not in labor force`
)
# Pairwise Pearson correlations; rows with any missing value are dropped.
correlation_matrix <- cor(employment_data, use = "complete.obs")
# Visualize the correlation matrix as circles.
library(corrplot)
corrplot(correlation_matrix, method = "circle")
# Map the employment rate by state with usmap.
# plot_usmap() matches rows on a `state` column; state.abb has no entry for
# the District of Columbia or Puerto Rico, so those rows get NA abbreviations
# and drop off the map -- TODO confirm that is acceptable.
state_abbreviations <- state.abb[match(df1_wide$NAME, state.name)]
df1_wide$state <- state_abbreviations
# Recompute the employment rate (already on a 0-100 percent scale).
df1_wide$EmploymentRate <- df1_wide$`Employed` / df1_wide$Total * 100
# BUG FIX: percent_format() multiplies by 100 by default, so a rate of 55
# would be labelled "5500%". scale = 1 keeps the already-percent values as-is.
plot_usmap(data = df1_wide, values = "EmploymentRate", labels = TRUE) +
  scale_fill_continuous(name = "Employment Rate (%)",
                        label = scales::percent_format(scale = 1)) +
  theme(legend.position = "right") +
  labs(title = "Employment Rate by State in 2021")
#Neha
#Education - K201501
# Fetch state-level educational attainment counts (table K201501).
# FIX: "acs1/subject" is not a documented `survey` value for get_acs(); use
# "acs1" (K-prefixed tables map to the ACS Supplemental Estimates).
df2 <- get_acs(geography = "state",
               table = "K201501",
               year = 2021,
               survey = "acs1", cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K201501 and caching the dataset for faster future access.
# Code-to-label lookup for the K201501 (educational attainment) variables.
variables_df2 <- data.frame(
  variable = c("K201501_001", "K201501_002", "K201501_003", "K201501_004", "K201501_005", "K201501_006", "K201501_007", "K201501_008"),
  label = c("Education_Total_students", "Education_Below_9th grade", "Education_9th to 12th grade_no diploma", "Education_High_school_graduate", "Education_Some college_no degree", "Education_Associates_degree", "Education_Bachelors_degree", "Education_Graduate_professional degree")
)
# Attach labels, then reshape to one row per state.
df2_labeled <- df2 %>%
  left_join(variables_df2, by = "variable")
df2_wide <- df2_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
# Exploratory Data Analysis (EDA) ----
# Ensure necessary libraries are installed and loaded.
# FIX: `if (!require(pkg)) install.packages(pkg)` installs a missing package
# but never loads it in that session; requireNamespace() is the documented
# availability check, and the library() calls below do the loading.
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
if (!requireNamespace("tidyr", quietly = TRUE)) install.packages("tidyr")
library(ggplot2)
library(tidyr)
# Long format: one row per (state, education level).
# gather() is superseded; pivot_longer() is the current tidyr API.
df2_long <- df2_wide %>%
  pivot_longer(-NAME, names_to = "Education_Level", values_to = "Count")
# Grouped bars of raw counts for each education level across states.
ggplot(df2_long, aes(x = NAME, y = Count, fill = Education_Level)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Distribution of Education Levels across States",
       x = "State",
       y = "Count")
# Exploratory Data Analysis (EDA) based on percentages ----
# Ensure necessary libraries are installed and loaded.
# (requireNamespace() checks availability without require()'s side effects;
# the library() calls below do the actual loading.)
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
if (!requireNamespace("tidyr", quietly = TRUE)) install.packages("tidyr")
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")
library(ggplot2)
library(tidyr)
library(dplyr)
# Long format: one row per (state, education level).
df2_long <- df2_wide %>%
  pivot_longer(-NAME, names_to = "Education_Level", values_to = "Count")
# BUG FIX: the per-state denominator was sum(Count), which adds the
# "Education_Total_students" row on top of the category rows and roughly
# halves every percentage. Use the table's own total row instead.
df2_long <- df2_long %>%
  group_by(NAME) %>%
  mutate(Total = Count[Education_Level == "Education_Total_students"]) %>%
  ungroup() %>%
  mutate(Percentage = (Count / Total) * 100)
# Grouped bars: share of each education level within each state.
ggplot(df2_long, aes(x = NAME, y = Percentage, fill = Education_Level)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Percentage Distribution of Education Levels across States",
       x = "State",
       y = "Percentage")
library(ggplot2)
library(tidyr)
library(dplyr)
# Long format: one row per (state, education level).
df2_long <- df2_wide %>%
  pivot_longer(-NAME, names_to = "Education_Level", values_to = "Count")
# Per-state denominator.
# BUG FIX: summing all rows adds the "Education_Total_students" row to the
# category rows (roughly halving every percentage); use the total row itself.
total_counts <- df2_long %>%
  filter(Education_Level == "Education_Total_students") %>%
  select(NAME, Total = Count)
# Join the totals back on and compute each level's share.
df2_long <- df2_long %>%
  left_join(total_counts, by = "NAME") %>%
  mutate(Percentage = (Count / Total) * 100)
# Create one chart per educational level.
unique_levels <- unique(df2_long$Education_Level)
plots <- list()
for (level in unique_levels) {
  df_subset <- filter(df2_long, Education_Level == level)
  p <- ggplot(df_subset, aes(x = NAME, y = Percentage, fill = Education_Level)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(title = paste("Percentage Distribution of", level, "across States"),
         x = "State",
         y = "Percentage") +
    guides(fill = guide_legend(title = "Education Level"))
  plots[[level]] <- p
}
# Print plots (optional)
for (plot in plots) {
  print(plot)
}
# Each education level as a percentage of the state's student total.
df2_percentages <- df2_wide %>%
  mutate(
    `Education_Below_9th grade_Percentage` = `Education_Below_9th grade` / Education_Total_students * 100,
    `Education_9th to 12th grade_no diploma_Percentage` = `Education_9th to 12th grade_no diploma` / Education_Total_students * 100,
    Education_High_school_graduate_Percentage = Education_High_school_graduate / Education_Total_students * 100,
    `Education_Some college_no degree_Percentage` = `Education_Some college_no degree` / Education_Total_students * 100,
    Education_Associates_degree_Percentage = Education_Associates_degree / Education_Total_students * 100,
    Education_Bachelors_degree_Percentage = Education_Bachelors_degree / Education_Total_students * 100,
    `Education_Graduate_professional degree_Percentage` = `Education_Graduate_professional degree` / Education_Total_students * 100
  )
percentage_columns <- grep("_Percentage$", names(df2_percentages), value = TRUE)
# FIX: wrap the external vector in all_of() -- bare external vectors in
# tidy-select are deprecated (this was emitting a tidyselect warning).
df2_long <- df2_percentages %>%
  pivot_longer(cols = all_of(percentage_columns),
               names_to = "Education_Level", values_to = "Percentage")
# BUG FIX: the title, axis and legend said "Age" -- copy-pasted from the age
# analysis; this chart shows education levels.
ggplot(df2_long, aes(x = reorder(NAME, -Education_Total_students), y = Percentage, fill = Education_Level)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Education Level Distribution by State",
       x = "State",
       y = "Percentage of Total Population",
       fill = "Education Level") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_fill_viridis_d() # Adjust the color scale as needed
#Esha
#Citizenship - K200501
# Fetch state-level citizenship counts (table K200501).
# FIX: "acs1/subject" is not a documented `survey` value for get_acs(); use
# "acs1" (K-prefixed tables map to the ACS Supplemental Estimates).
df3 <- get_acs(
  geography = "state",
  table = "K200501",
  year = 2021,
  survey = "acs1",
  cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K200501 and caching the dataset for faster future access.
# Code-to-label lookup for the K200501 (citizenship) variables.
variables_df3 <- data.frame(
  variable = c("K200501_001", "K200501_002","K200501_003"),
  label = c("Total","U.S. citizen", "Not a U.S. citizen")
)
# Attach labels, then reshape to one row per state.
df3_labeled <- df3 %>%
  left_join(variables_df3, by = "variable")
df3_wide <- df3_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
# Load necessary libraries
library(ggplot2)
library(tidyr)
library(dplyr)
# Long format: one row per (state, citizenship category).
# gather() is superseded; pivot_longer() is the current tidyr API.
df3_long <- df3_wide %>%
  pivot_longer(-NAME, names_to = "Citizenship", values_to = "Count")
# BUG FIX: summing all rows counts the "Total" row on top of the two
# citizenship rows, doubling the denominator (so every share was halved).
# Use the "Total" row itself as the per-state denominator.
df3_long <- df3_long %>%
  group_by(NAME) %>%
  mutate(Total = Count[Citizenship == "Total"]) %>%
  ungroup() %>%
  mutate(Percentage = (Count / Total) * 100)
# One chart per citizenship category.
for (category in unique(df3_long$Citizenship)) {
  # Filter data for the current citizenship category
  df3_subset <- df3_long %>%
    filter(Citizenship == category)
  # Generate and print the plot for citizenship
  p <- ggplot(df3_subset, aes(x = NAME, y = Percentage, fill = Citizenship)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(title = paste("Percentage Distribution of", category, "across States"),
         x = "State",
         y = "Percentage")
  print(p)
}
## Warning: Removed 1 rows containing missing values (`position_stack()`).
## Warning: Removed 1 rows containing missing values (`position_stack()`).
## Warning: Removed 1 rows containing missing values (`position_stack()`).
#Srika
#Age - K200104
# Fetch state-level age-band counts (table K200104).
# FIX: "acs1/subject" is not a documented `survey` value for get_acs(); use
# "acs1" (K-prefixed tables map to the ACS Supplemental Estimates).
df4 <- get_acs(geography = "state",
               table = "K200104",
               year = 2021,
               survey = "acs1", cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K200104 and caching the dataset for faster future access.
# Code-to-label lookup for the K200104 (age band) variables.
variables_df4 <- data.frame(
  variable = c("K200104_001", "K200104_002", "K200104_003", "K200104_004", "K200104_005", "K200104_006", "K200104_007", "K200104_008"),
  label = c("Total_age", "Age_under_18",
            "Age_18_to_24", "Age_25_to_34",
            "Age_35_to_44", "Age_45_to_54", "Age_55_to_64","Age_over_64")
)
# Attach labels, then reshape to one row per state.
df4_labeled <- df4 %>%
  left_join(variables_df4, by = "variable")
df4_wide <- df4_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
# Each age band as a percentage of the state's total population.
df4_percentages <- df4_wide %>%
  mutate(
    Age_under_18_Percentage = Age_under_18 / Total_age * 100,
    Age_18_to_24_Percentage = Age_18_to_24 / Total_age * 100,
    Age_25_to_34_Percentage = Age_25_to_34 / Total_age * 100,
    Age_35_to_44_Percentage = Age_35_to_44 / Total_age * 100,
    Age_45_to_54_Percentage = Age_45_to_54 / Total_age * 100,
    Age_55_to_64_Percentage = Age_55_to_64 / Total_age * 100,
    Age_over_64_Percentage = Age_over_64 / Total_age * 100
  )
percentage_columns <- grep("_Percentage$", names(df4_percentages), value = TRUE)
# FIX: wrap the external vector in all_of() -- bare external vectors in
# tidy-select are deprecated and emit a warning.
df4_long <- df4_percentages %>%
  pivot_longer(cols = all_of(percentage_columns), names_to = "Age_Group", values_to = "Percentage")
# Stacked bars: age composition per state, most-populous states first.
ggplot(df4_long, aes(x = reorder(NAME, -Total_age), y = Percentage, fill = Age_Group)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Age Distribution by State",
       x = "State",
       y = "Percentage of Total Population",
       fill = "Age Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_fill_viridis_d() # Adjust the color scale as needed
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ lubridate 1.9.2 ✔ stringr 1.5.0
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::map() masks maps::map()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
# Fetch the age table again with state geometries attached for mapping.
# FIX: "acs1/subject" is not a documented `survey` value for get_acs(); use
# "acs1" (K-prefixed tables map to the ACS Supplemental Estimates).
Age_data <- get_acs(geography = "state",
                    table = "K200104",
                    year = 2021,
                    geometry = TRUE,
                    resolution = "20m",
                    survey = "acs1", cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Loading ACSSE variables for 2021 from table K200104 and caching the dataset for faster future access.
##
## (shapefile download progress bar output omitted)
# K200104_007 is the "Age_55_to_64" band (see variables_df4);
# shift_geometry() repositions Alaska/Hawaii for a compact national map.
Mapping_age <- Age_data %>%
  filter(variable == "K200104_007") %>%
  shift_geometry()
# BUG FIX: the title said "Median Age by State", but the mapped estimate is
# the count of people aged 55-64.
ggplot(data = Mapping_age, aes(fill = estimate)) +
  geom_sf() +
  scale_fill_distiller(palette = "RdPu",
                       direction = 1) +
  labs(title = "Population Aged 55 to 64 by State, 2021",
       caption = "Data source: 2021 1-year ACS, US Census Bureau",
       fill = "ACS estimate") +
  theme_void()
#Niharika
# Housing - K202502
# Fetch state-level housing tenure counts (table K202502).
df5 <- get_acs(geography = "state",
               table = "K202502",
               year = 2021,
               survey = "acs1", # removed /subject since we're referring to a specific table ID
               cache_table = TRUE)
# Code-to-label lookup for the K202502 (tenure) variables.
variables_df5 <- data.frame(
  variable = c("K202502_001", "K202502_002", "K202502_003"),
  label = c("Total", "Owner Occupied", "Renter Occupied")
)
# Attach the readable labels to each estimate row.
df5_labeled <- left_join(df5, variables_df5, by = "variable")
# Reshape to one row per state, one column per tenure category.
df5_wide <- pivot_wider(df5_labeled,
                        names_from = label,
                        values_from = estimate,
                        id_cols = NAME)
# df5_wide will have states as rows and different housing-related variables as columns
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Housing EDA: reshape to long form (one row per state x tenure category).
df5_long <- tidyr::gather(df5_wide, key = "Housing_Category", value = "Count", -NAME)
# Attach each state's "Total" count, then express the occupied categories as a
# percentage of it; "Total" rows keep their raw count in the Percentage column.
df5_long <- df5_long %>%
  group_by(NAME) %>%
  mutate(Total = sum(Count[Housing_Category == "Total"])) %>%
  ungroup() %>%
  mutate(Percentage = ifelse(Housing_Category != "Total", Count / Total * 100, Count))
# Fill colours for the three tenure categories.
category_colors <- c("Owner Occupied" = "steelblue", "Renter Occupied" = "salmon", "Total" = "lightgreen")
# Build a sorted percentage bar chart for one tenure category.
create_bar_plot <- function(plot_data, housing_cat, plot_title) {
  subset_data <- filter(plot_data, Housing_Category == housing_cat)
  ggplot(subset_data, aes(x = reorder(NAME, -Percentage), y = Percentage, fill = Housing_Category)) +
    geom_bar(stat = "identity", position = position_dodge()) +
    labs(x = "State", y = "Percentage", title = plot_title) +
    scale_y_continuous(labels = percent_format(scale = 1)) +
    scale_fill_manual(values = category_colors) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}
# Owner-occupied share per state.
owner_plot <- create_bar_plot(df5_long, "Owner Occupied", "Percentage of Owner Occupied Housing by State")
print(owner_plot)
# Renter-occupied share per state.
renter_plot <- create_bar_plot(df5_long, "Renter Occupied", "Percentage of Renter Occupied Housing by State")
print(renter_plot)
#install.packages("tidycensus")
# Load necessary libraries
library(tidycensus)
library(dplyr)
library(tidyr)
#install.packages("factoextra")
library(factoextra)
#install.packages("ggplot2")
#install.packages("usmap")
#install.packages("maps")
library(ggplot2)
library(usmap)
library(maps)
#Abhay
#Disabilities - K201803
# Fetch state-level disability-status counts (table K201803).
# FIX: "acs1-year" is not a documented `survey` value for get_acs() (the docs
# list "acs1", "acs3", "acs5"); K-prefixed tables are routed to the ACS
# Supplemental Estimates automatically, so use the documented "acs1".
df6 <- get_acs(geography = "state",
               table = "K201803",
               year = 2021,
               survey = "acs1",
               cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K201803 and caching the dataset for faster future access.
# Code-to-label lookup for the K201803 (disability status) variables.
# NOTE(review): "cognative" is a misspelling of "cognitive", but later code
# references this exact label, so it is intentionally left unchanged here.
variables_df6 <- data.frame(
  variable = c("K201803_001", "K201803_002", "K201803_003", "K201803_004", "K201803_005", "K201803_006", "K201803_007","K201803_008","K201803_009"),
  label = c("Total_people", "Total With Disabilities",
            "Hearing", "Vision difficulty",
            "cognative", "ambulatory difficulty",
            "Self-care difficulty","Independent living difficulty","No Disability")
)
# Attach labels, then reshape to one row per state.
df6_labeled <- df6 %>%
  left_join(variables_df6, by = "variable")
df6_wide <- df6_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
# Preview the labelled, one-row-per-state disabilities table.
df6_wide
## # A tibble: 52 × 10
## NAME Total_people Total With Disabilit…¹ Hearing `Vision difficulty`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Alabama 4957633 808071 208028 152798
## 2 Alaska 702154 92390 33397 15748
## 3 Arizona 7174053 972252 298849 180792
## 4 Arkansas 2974701 517051 142133 105624
## 5 California 38724294 4324355 1140131 844049
## 6 Colorado 5715497 640346 211803 120570
## 7 Connecticut 3557526 427014 113490 78078
## 8 Delaware 987964 130551 37933 25335
## 9 District of … 659979 76754 14429 14569
## 10 Florida 21465883 2906367 812248 555361
## # ℹ 42 more rows
## # ℹ abbreviated name: ¹`Total With Disabilities`
## # ℹ 5 more variables: cognative <dbl>, `ambulatory difficulty` <dbl>,
## # `Self-care difficulty` <dbl>, `Independent living difficulty` <dbl>,
## # `No Disability` <dbl>
# df6_wide now has states as rows and the descriptive variable labels as columns
# Share of each state's population reporting any disability.
df6_wide <- mutate(df6_wide,
                   DisabilityRate = 100 * `Total With Disabilities` / `Total_people`)
# Bar chart of disability rate per state, highest first.
library(ggplot2)
ggplot(df6_wide, aes(x = reorder(NAME, -DisabilityRate), y = DisabilityRate)) +
  geom_col(fill = "steelblue") +
  labs(x = "State", y = "Disability Rate (%)", title = "Disability Rates by State") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# Correlate the individual disability-type counts across states.
disability_data <- df6_wide %>%
  select(`Hearing`, `Vision difficulty`, `cognative`, `ambulatory difficulty`, `Self-care difficulty`, `Independent living difficulty`)
# Pairwise correlations; "complete.obs" drops rows with missing values.
correlation_matrix <- cor(disability_data, use = "complete.obs")
# FIX: `require()` loads as a side effect and returns FALSE instead of
# erroring; requireNamespace() is the documented availability check, and the
# library() call below does the loading.
if (!requireNamespace("corrplot", quietly = TRUE)) install.packages("corrplot")
library(corrplot)
corrplot(correlation_matrix, method = "circle")
# Comparative analysis of disabilities within a state (example: Iowa)
iowa_data <- df6_wide %>%
filter(NAME == "Iowa") %>%
select(`Hearing`, `Vision difficulty`, `cognative`, `ambulatory difficulty`, `Self-care difficulty`, `Independent living difficulty`)
# Plotting the data for Iowa
# (iowa_data is a single-row data frame; as.matrix() yields one bar per
# disability type.)
barplot(as.matrix(iowa_data), beside = TRUE, legend.text = TRUE, args.legend = list(x = "topright"))
# Predicting 'Ambulatory difficulty' based on other disabilities
# NOTE(review): predictors and response are raw state-level counts, so the
# fit is dominated by state population size (R^2 ~ 1 in the output below);
# consider modelling rates instead -- TODO confirm intent.
lm_model <- lm(`ambulatory difficulty` ~ `Hearing` + `Vision difficulty` + `cognative` + `Self-care difficulty` + `Independent living difficulty`, data = df6_wide)
# Summary of the linear model
summary(lm_model)
##
## Call:
## lm(formula = `ambulatory difficulty` ~ Hearing + `Vision difficulty` +
## cognative + `Self-care difficulty` + `Independent living difficulty`,
## data = df6_wide)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56621 -11941 -2766 17524 73868
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8200.5856 5646.4977 -1.452 0.153197
## Hearing 0.6796 0.1793 3.791 0.000436 ***
## `Vision difficulty` 1.0070 0.1644 6.126 1.87e-07 ***
## cognative -0.5426 0.2291 -2.369 0.022116 *
## `Self-care difficulty` -1.9464 0.4434 -4.390 6.59e-05 ***
## `Independent living difficulty` 1.9228 0.3144 6.115 1.95e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24940 on 46 degrees of freedom
## Multiple R-squared: 0.9968, Adjusted R-squared: 0.9965
## F-statistic: 2909 on 5 and 46 DF, p-value: < 2.2e-16
# Print the fitted coefficients only (compact view of the model above).
lm_model
##
## Call:
## lm(formula = `ambulatory difficulty` ~ Hearing + `Vision difficulty` +
## cognative + `Self-care difficulty` + `Independent living difficulty`,
## data = df6_wide)
##
## Coefficients:
## (Intercept) Hearing
## -8200.5856 0.6796
## `Vision difficulty` cognative
## 1.0070 -0.5426
## `Self-care difficulty` `Independent living difficulty`
## -1.9464 1.9228
# K-means clustering
#Perform a clustering analysis to identify groups of states with similar disability profiles.
# K-means clustering with 3 centers
set.seed(123) # For reproducibility
# NOTE(review): clustering runs on raw counts, so assignments are dominated
# by state population size; consider scale()-ing first (the PCA below does
# standardize) -- TODO confirm intent.
clusters <- kmeans(disability_data, centers = 3)
# Add cluster assignment to the data
df6_wide$Cluster <- as.factor(clusters$cluster)
# PCA on standardized counts; biplot colours states by k-means cluster.
pca_res <- prcomp(disability_data, scale. = TRUE)
fviz_pca_biplot(pca_res, label = "none", col.ind = df6_wide$Cluster, addEllipses = TRUE)
#census_api_key(Sys.getenv("CENSUS_API_KEY"), install = TRUE)
#install.packages("tidycensus")
# Load necessary libraries
library(tidycensus)
library(dplyr)
library(tidyr)
#install.packages("factoextra")
library(factoextra)
#install.packages("ggplot2")
#install.packages("usmap")
#install.packages("maps")
library(ggplot2)
library(usmap)
library(maps)
# Merge all six state-level tables on the state name.
merged_data <- df1_wide %>%
left_join(df2_wide, by = "NAME") %>%
left_join(df3_wide, by = "NAME")%>%
left_join(df4_wide, by = "NAME")%>%
left_join(df5_wide, by = "NAME")%>%
left_join(df6_wide, by = "NAME")
# Drop the duplicated "Total" columns produced by the join suffixes.
# NOTE(review): this depends on the exact .x/.y suffixes dplyr assigned; if
# the set of joined tables changes, these names change too -- fragile.
columns_to_drop <- c('Total.x', 'Total.y', 'Total.x.x', 'Total.y.y')
merged_data <- merged_data[, !(names(merged_data) %in% columns_to_drop)]
# Inspect the final column set.
colnames(merged_data)
## [1] "NAME"
## [2] "In labor force:"
## [3] "Civilian labor force:"
## [4] "Employed"
## [5] "Unemployed"
## [6] "In Armed Forces"
## [7] "Not in labor force"
## [8] "EmploymentRate"
## [9] "UnemploymentRate"
## [10] "NotInLaborForceRate"
## [11] "state"
## [12] "Education_Total_students"
## [13] "Education_Below_9th grade"
## [14] "Education_9th to 12th grade_no diploma"
## [15] "Education_High_school_graduate"
## [16] "Education_Some college_no degree"
## [17] "Education_Associates_degree"
## [18] "Education_Bachelors_degree"
## [19] "Education_Graduate_professional degree"
## [20] "U.S. citizen"
## [21] "Not a U.S. citizen"
## [22] "Total_age"
## [23] "Age_under_18"
## [24] "Age_18_to_24"
## [25] "Age_25_to_34"
## [26] "Age_35_to_44"
## [27] "Age_45_to_54"
## [28] "Age_55_to_64"
## [29] "Age_over_64"
## [30] "Total"
## [31] "Owner Occupied"
## [32] "Renter Occupied"
## [33] "Total_people"
## [34] "Total With Disabilities"
## [35] "Hearing"
## [36] "Vision difficulty"
## [37] "cognative"
## [38] "ambulatory difficulty"
## [39] "Self-care difficulty"
## [40] "Independent living difficulty"
## [41] "No Disability"
## [42] "DisabilityRate"
## [43] "Cluster"
# "Socio-Economic Factors Influencing Employment in the United States:
#  A Comprehensive State-by-State Analysis"
# This project analyzes how socio-economic factors -- education, age,
# citizenship status, housing, and disabilities -- relate to employment rates
# across US states. Using 2021 American Community Survey (ACS) data, it applies
# statistical techniques to identify correlations, trends, and patterns in
# state-level employment dynamics.
# Data Exploration and Cleaning ----
# - Clean the data and handle missing values.
# - Normalize the data if required for better comparison.
#Check for Missing Values
# Count missing values per column; returns a named integer vector.
# FIX: vapply() pins the return type (sapply() can silently change shape on
# empty or unusual input, an anti-pattern in non-interactive code).
summarize_missing_data <- function(data) {
  vapply(data, function(x) sum(is.na(x)), integer(1L))
}
# Tabulate missing values across the merged dataset and print the summary.
missing_data_summary <- summarize_missing_data(merged_data)
missing_data_summary
## NAME In labor force:
## 0 0
## Civilian labor force: Employed
## 0 0
## Unemployed In Armed Forces
## 0 0
## Not in labor force EmploymentRate
## 0 0
## UnemploymentRate NotInLaborForceRate
## 0 0
## state Education_Total_students
## 2 0
## Education_Below_9th grade Education_9th to 12th grade_no diploma
## 0 0
## Education_High_school_graduate Education_Some college_no degree
## 0 0
## Education_Associates_degree Education_Bachelors_degree
## 0 0
## Education_Graduate_professional degree U.S. citizen
## 0 1
## Not a U.S. citizen Total_age
## 1 0
## Age_under_18 Age_18_to_24
## 0 0
## Age_25_to_34 Age_35_to_44
## 0 0
## Age_45_to_54 Age_55_to_64
## 0 0
## Age_over_64 Total
## 0 0
## Owner Occupied Renter Occupied
## 0 0
## Total_people Total With Disabilities
## 0 0
## Hearing Vision difficulty
## 0 0
## cognative ambulatory difficulty
## 0 0
## Self-care difficulty Independent living difficulty
## 0 0
## No Disability DisabilityRate
## 0 0
## Cluster
## 0
# Handling missing values
# Impute missing values in numeric columns with the column median.
# FIX: direct indexed assignment replaces the original ifelse(), which
# rebuilds the whole column and strips vector attributes.
handle_missing_values <- function(data) {
  for (col in colnames(data)) {
    if (is.numeric(data[[col]])) {
      col_median <- median(data[[col]], na.rm = TRUE)
      data[[col]][is.na(data[[col]])] <- col_median
    }
    # Non-numeric (categorical) columns are left untouched.
  }
  return(data)
}
# Impute missing numeric values in the merged dataset (median per column).
merged_data <- handle_missing_values(merged_data)
# Function to min-max scale every numeric column to [0, 1].
# FIX: a constant column has zero range, and the original formula produced
# NaN for it (0/0), which then poisons downstream cor(); such columns are
# mapped to 0 instead.
normalize_data <- function(data) {
  for (col in colnames(data)) {
    if (is.numeric(data[[col]])) {
      min_val <- min(data[[col]], na.rm = TRUE)
      max_val <- max(data[[col]], na.rm = TRUE)
      range_val <- max_val - min_val
      if (range_val == 0) {
        data[[col]] <- rep(0, length(data[[col]]))
      } else {
        data[[col]] <- (data[[col]] - min_val) / range_val
      }
    }
  }
  return(data)
}
# Normalize the merged data, keep only numeric columns, and visualize their
# pairwise correlations.
normalized_data <- normalize_data(merged_data)
numeric_cols <- vapply(normalized_data, is.numeric, logical(1))
numeric_data <- normalized_data[numeric_cols]
# Pairwise correlations; rows with any missing value are dropped.
correlation_matrix <- cor(numeric_data, use = "complete.obs")
library(corrplot)
corrplot(correlation_matrix, method = "circle")
# Scatter of bachelor's-degree counts (normalized) vs employment rate,
# with a fitted linear trend.
ggplot(normalized_data, aes(x = `Education_Bachelors_degree`, y = EmploymentRate)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Relation between Bachelors Degree Holders and Employment Rate", x = "Percentage of Bachelor's Degree Holders", y = "Employment Rate")
# Welch t-test: does employment rate differ between states above vs below the
# median bachelor's-degree level?
high_education_states <- normalized_data$`Education_Bachelors_degree` >
  median(normalized_data$`Education_Bachelors_degree`)
t.test(normalized_data$EmploymentRate ~ high_education_states)
##
## Welch Two Sample t-test
##
## data: normalized_data$EmploymentRate by high_education_states
## t = -0.687, df = 33.752, p-value = 0.4968
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -0.12973315 0.06419418
## sample estimates:
## mean in group FALSE mean in group TRUE
## 0.7029188 0.7356883
# Linear regression of employment rate on the socio-economic predictors.
# BUG FIX: `EmploymentRate ~ .` on the full data includes the identifier
# columns NAME and state, which add one dummy per row -- a saturated model
# with zero residual degrees of freedom in which every other coefficient is
# NA (visible in the original output). Drop the identifiers so the
# socio-economic variables are actually estimated.
model <- lm(EmploymentRate ~ ., data = select(normalized_data, -NAME, -state))
summary(model)
##
## Call:
## lm(formula = EmploymentRate ~ ., data = normalized_data)
##
## Residuals:
## ALL 50 residuals are 0: no residual degrees of freedom!
##
## Coefficients: (90 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.55528 NaN NaN NaN
## NAMEAlaska 0.11648 NaN NaN NaN
## NAMEArizona 0.09049 NaN NaN NaN
## NAMEArkansas 0.03227 NaN NaN NaN
## NAMECalifornia 0.12642 NaN NaN NaN
## NAMEColorado 0.34310 NaN NaN NaN
## NAMEConnecticut 0.24700 NaN NaN NaN
## NAMEDelaware 0.07616 NaN NaN NaN
## NAMEFlorida 0.04674 NaN NaN NaN
## NAMEGeorgia 0.16475 NaN NaN NaN
## NAMEHawaii 0.01727 NaN NaN NaN
## NAMEIdaho 0.23916 NaN NaN NaN
## NAMEIllinois 0.19745 NaN NaN NaN
## NAMEIndiana 0.22507 NaN NaN NaN
## NAMEIowa 0.34163 NaN NaN NaN
## NAMEKansas 0.28824 NaN NaN NaN
## NAMEKentucky 0.03643 NaN NaN NaN
## NAMELouisiana -0.02523 NaN NaN NaN
## NAMEMaine 0.14686 NaN NaN NaN
## NAMEMaryland 0.27979 NaN NaN NaN
## NAMEMassachusetts 0.29442 NaN NaN NaN
## NAMEMichigan 0.08970 NaN NaN NaN
## NAMEMinnesota 0.39230 NaN NaN NaN
## NAMEMississippi -0.06135 NaN NaN NaN
## NAMEMissouri 0.19036 NaN NaN NaN
## NAMEMontana 0.17200 NaN NaN NaN
## NAMENebraska 0.44472 NaN NaN NaN
## NAMENevada 0.08173 NaN NaN NaN
## NAMENew Hampshire 0.35974 NaN NaN NaN
## NAMENew Jersey 0.21986 NaN NaN NaN
## NAMENew Mexico -0.08934 NaN NaN NaN
## NAMENew York 0.10069 NaN NaN NaN
## NAMENorth Carolina 0.11695 NaN NaN NaN
## NAMENorth Dakota 0.38003 NaN NaN NaN
## NAMEOhio 0.17893 NaN NaN NaN
## NAMEOklahoma 0.09198 NaN NaN NaN
## NAMEOregon 0.12387 NaN NaN NaN
## NAMEPennsylvania 0.15425 NaN NaN NaN
## NAMERhode Island 0.20612 NaN NaN NaN
## NAMESouth Carolina 0.04952 NaN NaN NaN
## NAMESouth Dakota 0.36558 NaN NaN NaN
## NAMETennessee 0.12815 NaN NaN NaN
## NAMETexas 0.21828 NaN NaN NaN
## NAMEUtah 0.42911 NaN NaN NaN
## NAMEVermont 0.26235 NaN NaN NaN
## NAMEVirginia 0.22197 NaN NaN NaN
## NAMEWashington 0.18129 NaN NaN NaN
## NAMEWest Virginia -0.17854 NaN NaN NaN
## NAMEWisconsin 0.30928 NaN NaN NaN
## NAMEWyoming 0.29484 NaN NaN NaN
## `In labor force:` NA NA NA NA
## `Civilian labor force:` NA NA NA NA
## Employed NA NA NA NA
## Unemployed NA NA NA NA
## `In Armed Forces` NA NA NA NA
## `Not in labor force` NA NA NA NA
## UnemploymentRate NA NA NA NA
## NotInLaborForceRate NA NA NA NA
## stateAL NA NA NA NA
## stateAR NA NA NA NA
## stateAZ NA NA NA NA
## stateCA NA NA NA NA
## stateCO NA NA NA NA
## stateCT NA NA NA NA
## stateDE NA NA NA NA
## stateFL NA NA NA NA
## stateGA NA NA NA NA
## stateHI NA NA NA NA
## stateIA NA NA NA NA
## stateID NA NA NA NA
## stateIL NA NA NA NA
## stateIN NA NA NA NA
## stateKS NA NA NA NA
## stateKY NA NA NA NA
## stateLA NA NA NA NA
## stateMA NA NA NA NA
## stateMD NA NA NA NA
## stateME NA NA NA NA
## stateMI NA NA NA NA
## stateMN NA NA NA NA
## stateMO NA NA NA NA
## stateMS NA NA NA NA
## stateMT NA NA NA NA
## stateNC NA NA NA NA
## stateND NA NA NA NA
## stateNE NA NA NA NA
## stateNH NA NA NA NA
## stateNJ NA NA NA NA
## stateNM NA NA NA NA
## stateNV NA NA NA NA
## stateNY NA NA NA NA
## stateOH NA NA NA NA
## stateOK NA NA NA NA
## stateOR NA NA NA NA
## statePA NA NA NA NA
## stateRI NA NA NA NA
## stateSC NA NA NA NA
## stateSD NA NA NA NA
## stateTN NA NA NA NA
## stateTX NA NA NA NA
## stateUT NA NA NA NA
## stateVA NA NA NA NA
## stateVT NA NA NA NA
## stateWA NA NA NA NA
## stateWI NA NA NA NA
## stateWV NA NA NA NA
## stateWY NA NA NA NA
## Education_Total_students NA NA NA NA
## `Education_Below_9th grade` NA NA NA NA
## `Education_9th to 12th grade_no diploma` NA NA NA NA
## Education_High_school_graduate NA NA NA NA
## `Education_Some college_no degree` NA NA NA NA
## Education_Associates_degree NA NA NA NA
## Education_Bachelors_degree NA NA NA NA
## `Education_Graduate_professional degree` NA NA NA NA
## `U.S. citizen` NA NA NA NA
## `Not a U.S. citizen` NA NA NA NA
## Total_age NA NA NA NA
## Age_under_18 NA NA NA NA
## Age_18_to_24 NA NA NA NA
## Age_25_to_34 NA NA NA NA
## Age_35_to_44 NA NA NA NA
## Age_45_to_54 NA NA NA NA
## Age_55_to_64 NA NA NA NA
## Age_over_64 NA NA NA NA
## Total NA NA NA NA
## `Owner Occupied` NA NA NA NA
## `Renter Occupied` NA NA NA NA
## Total_people NA NA NA NA
## `Total With Disabilities` NA NA NA NA
## Hearing NA NA NA NA
## `Vision difficulty` NA NA NA NA
## cognative NA NA NA NA
## `ambulatory difficulty` NA NA NA NA
## `Self-care difficulty` NA NA NA NA
## `Independent living difficulty` NA NA NA NA
## `No Disability` NA NA NA NA
## DisabilityRate NA NA NA NA
## Cluster2 NA NA NA NA
## Cluster3 NA NA NA NA
##
## Residual standard error: NaN on 0 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 49 and 0 DF, p-value: NA